//-----------------------------------------------------------------------
//
// Gentrification and pioneer businesses
// Kristian Behrens, Brahim Boualam, Julien Martin, and Florian Mayneris
//
// First version: 17/02/2015
// This  version: 10/11/2021
//
// Preparation of the 2021 NETS data for the NY CBSA
//
//-----------------------------------------------------------------------


clear
clear matrix
set more off

// Set paths to fit your local machine
global datapath "/Users/kristianbehrens/Desktop/GENTRI_RESTAT_FINAL"
cd $datapath/data/nets/

// Generate log file
qui cap log close
qui cap log using "$datapath/logs/prepare_data_new_york.log", replace


// Import the raw NETS data as obtained from Walls & Associates
import delimited "$datapath/data/nets/NETS2012_NewYorkCBSA.txt", clear

save "$datapath/temp/nets_new_york.dta", replace


//-----------------------------------------------------------------------
// Step 1: Prepare the 1990-2012 panel of time-varying variables
//-----------------------------------------------------------------------


use "$datapath/temp/nets_new_york.dta", clear

#delimit ;

//Keep only the variables we will use
keep dunsnumber emp* empc* sic* hqduns* fips* paydexmin* paydexmax* dnbrating* 
sales* salesc*;

drop sic8* salesgrowthpeer empc salesgrowth sic2 sic3 sic4 sic6 hqduns fipscounty
emphere empherec sicchange hqdunschange fipschange saleshere salesherec 
fipscounty_first;

local varlist "emp08 sic01 sic02 hqduns01 fips01 fips02 paydexmax08 paydexmin09
paydexmax09 paydexmin10 paydexmax10 paydexmin11 paydexmax11 paydexmin12
paydexmax12 salesc10 sales11 salesc11 sales12 salesc12";

#delimit cr

// Some preliminary data cleaning
// Drop two observations where the data is missing and not numerical/not aligned

gen test = 0

foreach v in `varlist' {
	gen test_temp = regexm(`v', "[a-zA-Z]")
	replace test = test + test_temp
	drop test_temp
}

drop if test > 0 // 2 observations will be dropped 
drop test*

foreach v in `varlist' {
	destring `v', replace
}

save "$datapath/temp/nets_new_york1.dta", replace


// Recast all variable names into a format that can be used for a reshape long
local index00 "00 01 02 03 04 05 06 07 08 09 10 11 12"
local index90 "90 91 92 93 94 95 96 97 98 99"

local variables "emp empc sic hqduns fips paydexmin paydexmax dnbrating sales salesc"

foreach i in `index90' {
	foreach v in `variables' {
		ren `v'`i' `v'19`i'
	}
}

foreach i in `index00' {
	foreach v in `variables' {
		ren `v'`i' `v'20`i'
	}
}

// Reshape the data in long format (using the faster sreshape command)
sreshape long emp empc sic hqduns fips paydexmin paydexmax dnbrating sales salesc, i(dunsnumber) j(year)


// Drop all observations that are completely missing for a year
gen test = (emp == . &  empc == . & sic == . & hqduns == . & fips == . & paydexmin == . & paydexmax == . & dnbrating == "" & sales == . & salesc == .)

drop if test == 1
drop test

//Save temp file
save "$datapath/temp/nets_new_york_long_dropped.dta", replace


//------------------------------------------------------------------------
// Step 2: Prepare the 2012 only data in a separate file; save also the HQ
//         and the companyname data in separate files
//------------------------------------------------------------------------


use "$datapath/temp/nets_new_york.dta", replace


// Drop the variables we won't use
#delimit ;

drop region officer title area phone tradename zip4 hqtradename hqzip4 hqofficer
hqtitle hqarea hqphone sizecat sic2 sic3 sic4 sic6 salesgrowth salesgrowthpeer
moveemp empc movesales movesalesc;

#delimit cr

gen estcat_n = 0
replace estcat_n = 1 if estcat == "Standalone"
replace estcat_n = 2 if estcat == "Branch"
replace estcat_n = 3 if estcat == "Headquarters"
label var estcat_n "1=Alone, 2=Branch, 3=HQ"
drop estcat


recast byte estcat_n

// Clean all the string variables
replace city = trim(city)
recast str20 city

replace company = trim(company)
recast str30 company

replace hqcity = trim(hqcity)
recast str20 hqcity

replace hqcompany = trim(hqcompany)
recast str30 hqcompany

// Drop all the 'first', since we are going to merge with the move files anyway
drop city_first state_first zipcode_first cbsa_first fipscounty_first citycode_first

//Drop the 3-6th SIC codes (most of them missing anyway)
drop sic8_3 sic8_4 sic8_5 sic8_6

drop industry

preserve 
keep dunsnumber company
sort dunsnumber

// This file contains the legal names of the companies 
save "$datapath/results/NewYork/nets_new_york_companyname.dta", replace


restore

preserve
keep hqduns hqcompany hqcity hqstate hqzipcode
sort hqduns
duplicates drop hqduns, force
sort hqduns

drop if hqduns == 0

// This file contains all the informations on the HQs 
save "$datapath/results/NewYork/nets_new_york_HQdetails.dta", replace


restore

drop company hqcompany hqcity hqstate hqzipcode

drop if empherec == "Yes" | empherec == "No "

destring emphere, replace
destring empherec, replace

// Save temp file
save "$datapath/temp/nets_new_york_wide_clean.dta", replace 

preserve

keep dunsnumber latitude longitude levelcode
sort dunsnumber 
save "$datapath/results/NewYork/nets_new_york_coordinates.dta", replace


// Merge the cleaned long and wide files if necessary
// (allows to reconstruct a full cleaned file if needed)

//restore

//use "temp/nets_new_york_long_dropped.dta", clear
//sort dunsnumber
//merge m:1 dunsnumber using "temp/nets_new_york_wide_clean.dta"


//------------------------------------------------------------------------
// Step 3: We now reconstruct the moves to have the geographical structure
//         for each of the years
//------------------------------------------------------------------------


// Import the moves file
import delimited "$datapath/data/nets/Moves2012_NewYorkCBSA.txt", stringcols(_all) clear
destring dunsnumber, replace
ren moveyear year
destring year, replace

keep dunsnumber year originlatitude originlongitude originlevelcode destlatitude destlongitude destlevelcode distance
sort dunsnumber year
save "$datapath/temp/nets_new_york_moves.dta", replace

// Merge with the long file we have created before
use "$datapath/temp/nets_new_york_long_dropped.dta", clear

merge 1:1 dunsnumber year using "$datapath/temp/nets_new_york_moves.dta"
drop if _merge == 2
gen move = 0
replace move = 1 if _merge == 3
drop _merge 

destring originlatitude, replace 
destring originlongitude, replace
destring destlatitude, replace
destring destlongitude, replace
destring distance, replace

bysort dunsnumber: gen cum = sum(move)
egen num_move = sum(move), by(dunsnumber)

drop cum

save "$datapath/temp/nets_new_york_long_moves.dta", replace


// Generate the lat and lon of the moves across time
sort dunsnumber year
bys dunsnumber: g cum = sum(move)
xtset dunsnumber year

replace destlatitude = F.originlatitude if F.move==1 & F.cum==1 & num_move > 0
replace destlongitude = F.originlongitude if F.move==1 & F.cum==1 & num_move > 0

//replace destlevelcode = F.originlevelcode if F.move==1 & F.cum==1 & num_move>0

replace destlevelcode = originlevelcode[_n+1] if move[_n+1]==1 & cum[_n+1]==1 & num_move>0
egen _ = group(destlevelcode)
egen lon = max(destlongitude), by(dunsnumber cum)
egen lat = max(destlatitude), by(dunsnumber cum)
egen destlevelcode_gr = max(_), by(dunsnumber cum)
label var destlevelcode_gr "1:B 2:D 3:S 4:T 5:Z "

drop originlatitude originlongitude originlevelcode destlatitude destlongitude destlevelcode cum _
ren destlevelcode_gr geo_quality

sort dunsnumber

merge m:1 dunsnumber using "$datapath/results/NewYork/nets_new_york_coordinates.dta"


//------------------------------------------------------------------------
// Step 4: Final clean-up and assembly of the 1990-2012 long dataset
//------------------------------------------------------------------------

drop _
egen _ = group(levelcode)

replace lon = longitude if missing(lon)
replace lat = latitude if missing(lat)
replace geo_quality = _ if missing(geo_quality)

drop if missing(lon) | missing(lat) | missing(geo_quality)

drop  latitude longitude levelcode _

sort dunsnumber year

order dunsnumber year sic hqduns fips lat lon geo_quality move num_move distance emp empc sales salesc paydexmin paydexmax dnbrating 

replace distance = 0 if missing(distance)
replace dnbrating = "." if  dnbrating == "-- "
replace dnbrating = trim(dnbrating)
recast str3 dnbrating
compress

drop _

lab var fips "FIPS county code"
lab var empc "0=actual, 1=bottom range, 2=D&B est., 3=Walls est."
lab var salesc "0=actual, 1=bottom range, 2=D&B est., 3=Walls est."
lab var sales "Est. sales in $"
lab var emp "Employment at location"
lab var distance "Distance moved during year"
lab var move "Moved this year"
lab var num_move "Number of moves during whole period"
lab var lat "Latitude in degrees"
lab var lon "Longitude in degrees"
lab var sic "8-digit primary SIC code"
lab var paydexmin "Minimum Paydex score"
lab var paydexmax "Maximum Paydex score"
lab var dnbrating "D&B rating"
lab var hqduns "DunsNumber of headquarter"
lab var year "Year from 1990-2012"

tostring dunsnumber, replace
tostring hqduns, replace

save "$datapath/results/NewYork/nets_new_york_1990_2012.dta", replace


//------------------------------------------------------------------------
// Step 5: Add the NAICS data to the database, since the baseline files
//         all use the SIC classification
//------------------------------------------------------------------------

import delimited "$datapath/data/nets/NAICS2012_NewYorkCBSA.txt", clear

// Rename variables for the reshape
ren naics90 naics1990
ren naics91 naics1991
ren naics92 naics1992
ren naics93 naics1993
ren naics94 naics1994
ren naics95 naics1995
ren naics96 naics1996
ren naics97 naics1997
ren naics98 naics1998
ren naics99 naics1999
ren naics00 naics2000
ren naics01 naics2001
ren naics02 naics2002
ren naics03 naics2003
ren naics04 naics2004
ren naics05 naics2005
ren naics06 naics2006
ren naics07 naics2007
ren naics08 naics2008
ren naics09 naics2009
ren naics10 naics2010
ren naics11 naics2011
ren naics12 naics2012

sreshape long naics, i(dunsnumber) j(year)

tostring dunsnumber, replace

save "$datapath/results/NewYork/nets_new_york_NAICS.dta", replace

// Merge with our master data
use "$datapath/results/nets_NY_1990_2012.dta", clear

merge 1:1 dunsnumber year using "$datapath/results/NewYork/nets_new_york_NAICS.dta"

keep if _merge == 3
drop _merge

lab var naics "6-digit NAICS code"

merge 1:1 dunsnumber year using "$datapath/results/NewYork/duns_idconc_mapping.dta"
keep if _merge == 3
drop _merge

save "$datapath/results/NewYork/nets_new_york_1990_2012.dta", replace

qui cap log close

// All done
// Final file: nets_new_york_1990_2012.dta  with 24,340,909 plant-year observations

exit


